suppressPackageStartupMessages({
library(GenomicRanges)
library(epiwraps)
library(ggplot2)
library(rGREAT) # Gene Ontology enrichment among genomic regions
})
## Warning: package 'GenomeInfoDb' was built under R version 4.3.3
## Warning: package 'rGREAT' was built under R version 4.4.0
options(timeout = 6000)
download.file("https://ethz-ins.org/content/w10.assignment.zip", "w10.assignment.zip")
unzip("w10.assignment.zip")
list.files()
… to illustrate the relationship between the binding of the different proteins
tracks <- list.files(pattern="bw$") # = binding intensities for all 3 CREBs
peaks <- list.files(pattern="bed$")
# we first import the peaks
peaks <- lapply(peaks, rtracklayer::import.bed)
# we'll focus on the high-quality peaks
peaks <- lapply(peaks, FUN=function(x) x[x$score>800])
# we get the union of non-redundant regions
regions <- reduce(unlist(GRangesList(peaks)))
#regions_CREB1 <- rtracklayer::import.bed("Creb1.bed") # = Genomic locations for each CREB
#regions_CREB3 <- rtracklayer::import.bed("Creb3.bed")
#regions_CREB3L1 <- rtracklayer::import.bed("Creb3L1.bed")
ese <- signal2Matrix(tracks, regions, extend=2000) # Expression set
## Reading Creb1.bw
## Reading Creb3.bw
## Reading Creb3L1.bw
plotEnrichedHeatmaps(ese, use_raster=FALSE)
ese2 <- ese[1:1000,] # select the first 1000 rows from the ese object while keeping all columns
plotEnrichedHeatmaps(ese2, cluster_rows = TRUE, show_row_dend=TRUE ) # gives activities of different TFs across the same genomic regions
set.seed(123) # to ensure that it gives the same results everytime
cl <- clusterSignalMatrices(ese, k=4) # dividing data into k=4 clusters
## ~66% of the variance explained by clusters
table(cl) # states how many items are in each cluster
## cl
## 1 2 3 4
## 901 198 450 720
head(cl)
## [1] 4 1 1 1 1 1
## Levels: 1 2 3 4
length(cl)
## [1] 2269
length(regions)
## [1] 2269
# to make sure the cluster labels stay associated with the corresponding regions/rows
# even if we manipulate the object, put them inside the rowData of the object:
rowData(ese)$cluster <- cl
head(rowData(ese))
## DataFrame with 6 rows and 1 column
## cluster
## <factor>
## chr1:778487-779069 4
## chr1:904436-904985 1
## chr1:941785-942096 1
## chr1:943126-943501 1
## chr1:959088-959463 1
## chr1:961148-961904 1
Clusters 1 and 4 contain most of the regions, capturing about 70% of the dataset’s variability. This suggests the clustering highlights key patterns, helping identify regions with similar regulatory characteristics or functions, crucial for further genomic analysis and research.
mycolors <- c("1"="red", "2"="blue", "3"="darkgreen", "4"="black")
plotEnrichedHeatmaps(ese, row_split="cluster", mean_color=mycolors, colors=c("white","darkred"), use_raster=FALSE)
Trying different numbers of clusters:
cl2 <- clusterSignalMatrices(ese, k=2:10)
ggplot(cl2$varExplained, aes(k, varExplained)) + geom_line()
Plotting just the averages:
d <- meltSignals(ese, splitBy=cl)
ggplot(d, aes(position, mean, colour=sample)) + geom_line(size=1.2) + facet_wrap(~split)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Interpretation of clusters ##
The graphs show that all three transcription factors have peaks at the center, indicating specific and localized binding sites on the genome. Creb1 has a sharp, intense peak in cluster 1, suggesting strong and specific binding. In contrast, Creb3 and Creb3L1 exhibit broader peaks, indicating less specific binding across wider genomic regions. This suggests Creb1 may play a more direct regulatory role, while Creb3 and Creb3L1 modulate gene expression more broadly or cooperatively. The overlapping peaks also hint at potential competitive or cooperative interactions among these factors, affecting similar genomic functions or regulatory pathways.
Find what's enriched in one cluster with respect to the others:
### Cluster 1
``` r
# we first split the regions by cluster:
split_regions <- split(rowRanges(ese), rowData(ese)$cluster)
lengths(split_regions)
## 1 2 3 4
## 901 198 450 720
res1 <- great(split_regions[["1"]], gene_sets="GO:BP", tss_source="hg38", # "Genomic Regions Enrichment of Annotations Tool"
background=regions, cores=2)
## * TSS source: TxDb.
## * check whether TxDb package 'TxDb.Hsapiens.UCSC.hg38.knownGene' is installed.
## * gene ID type in the extended TSS is 'Entrez Gene ID'.
## * restrict chromosomes to 'chr1, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chr10,
## chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr20, chr21, chr22, chrX,
## chrY, chrM'.
## * 18104/30601 protein-coding genes left.
## * update seqinfo to the selected chromosomes.
## * TSS extension mode is 'basalPlusExt'.
## * construct the basal domains by extending 5000bp to upstream and 1000bp to downsteram of TSS.
## * calculate distances to neighbour regions.
## * extend to both sides until reaching the neighbour genes or to the maximal extension.
## * use GO:BP ontology with 15709 gene sets (source: org.Hs.eg.db).
## * check gene ID type in `gene_sets` and in `extended_tss`.
## * use self-defined background regions.
## * remove excluded regions from background.
## * overlap `gr` to background regions (based on midpoint).
## * in total 901 `gr`.
## * overlap extended TSS to background regions.
## * check which genes are in the gene sets.
## * only take gene sets with size >= 5.
## * in total 2538 gene sets.
## * overlap `gr` to every extended TSS.
## * perform binomial test for each biological term.
bp1 <- getEnrichmentTables(res1)
head(bp1)
## id description genome_fraction
## 1 GO:0009653 anatomical structure morphogenesis 0.26680112
## 2 GO:0030182 neuron differentiation 0.12904307
## 3 GO:0048699 generation of neurons 0.13436450
## 4 GO:0030030 cell projection organization 0.12373319
## 5 GO:0032989 cellular component morphogenesis 0.07208059
## 6 GO:0030154 cell differentiation 0.38155624
## observed_region_hits fold_enrichment p_value p_adjust mean_tss_dist
## 1 307 1.277103 5.890274e-07 0.001369489 93674
## 2 165 1.419137 2.505025e-06 0.002438631 102832
## 3 170 1.404234 3.165546e-06 0.002438631 101428
## 4 158 1.417249 4.640603e-06 0.002438631 95375
## 5 102 1.570569 5.577920e-06 0.002438631 98727
## 6 408 1.186798 7.483697e-06 0.002438631 99646
## observed_gene_hits gene_set_size fold_enrichment_hyper p_value_hyper
## 1 155 206 1.202493 2.161035e-05
## 2 83 110 1.205879 2.018515e-03
## 3 86 115 1.195140 2.669991e-03
## 4 82 108 1.213412 1.553507e-03
## 5 49 62 1.263056 3.530111e-03
## 6 244 333 1.171019 1.736051e-06
## p_adjust_hyper
## 1 0.006641778
## 2 0.102022744
## 3 0.108907544
## 4 0.085569233
## 5 0.122814736
## 6 0.002018160
We plot the Processes:
ggplot(head(bp1,15), aes(fold_enrichment, reorder(description, p_adjust),
size=observed_region_hits, color=-log10(p_adjust))) +
geom_point() + scale_color_viridis_c()
Interpretation The enrichment analysis of cluster 1 reveals that developmental processes, particularly neuronal development, are significantly associated with CREB transcription factors in these genomic regions. CREB1 shows the sharpest peak, suggesting its key role, while CREB3 also has a strong signal, indicating potential cooperative or competitive interactions. CREB3L1 appears to play a less prominent role.
# we first split the regions by cluster:
split_regions <- split(rowRanges(ese), rowData(ese)$cluster)
lengths(split_regions)
## 1 2 3 4
## 901 198 450 720
res4 <- great(split_regions[["4"]], gene_sets="GO:BP", tss_source="hg38",
background=regions, cores=2)
## * TSS source: TxDb.
## * extended_tss is already cached, directly use it.
## * use GO:BP ontology with 15709 gene sets (source: org.Hs.eg.db).
## * check gene ID type in `gene_sets` and in `extended_tss`.
## * use self-defined background regions.
## * remove excluded regions from background.
## * overlap `gr` to background regions (based on midpoint).
## * in total 720 `gr`.
## * overlap extended TSS to background regions.
## * check which genes are in the gene sets.
## * only take gene sets with size >= 5.
## * in total 2538 gene sets.
## * overlap `gr` to every extended TSS.
## * perform binomial test for each biological term.
bp4 <- getEnrichmentTables(res4)
head(bp4)
## id description genome_fraction observed_region_hits
## 1 GO:0006396 RNA processing 0.11511954 111
## 2 GO:0008380 RNA splicing 0.05435198 56
## 3 GO:0043604 amide biosynthetic process 0.07454532 72
## 4 GO:0043043 peptide biosynthetic process 0.06719469 65
## 5 GO:0006412 translation 0.06686049 64
## 6 GO:0006518 peptide metabolic process 0.07368049 69
## fold_enrichment p_value p_adjust mean_tss_dist observed_gene_hits
## 1 1.339188 0.000961458 1 38496 76
## 2 1.431002 0.005203136 1 16697 39
## 3 1.341466 0.007441516 1 29793 49
## 4 1.343525 0.010415020 1 32046 44
## 5 1.329468 0.013477481 1 32535 43
## 6 1.300661 0.016366483 1 30223 48
## gene_set_size fold_enrichment_hyper p_value_hyper p_adjust_hyper
## 1 103 1.398909 4.647055e-06 0.0006449448
## 2 48 1.540411 2.880108e-05 0.0029452895
## 3 73 1.272584 7.552374e-03 0.1686696766
## 4 64 1.303425 5.772122e-03 0.1503974163
## 5 63 1.294020 7.771523e-03 0.1696637061
## 6 72 1.263927 9.939960e-03 0.1983259943
We plot the top Biological Processes:
ggplot(head(bp4,15), aes(fold_enrichment, reorder(description, p_adjust),
size=observed_region_hits, color=-log10(p_adjust))) +
geom_point() + scale_color_viridis_c()
library(knitr)
cluster_summary <- data.frame(
Cluster = c("Cluster 1", "Cluster 4"),
Key_Biological_Processes = c("Developmental processes, especially neuronal development",
"Glycosaminoglycan metabolic processes, amide metabolic processes"),
Region_Hits = c("Significant association", "Amide metabolic processes have most hits"),
Transcription_Factor_Peaks = c("Creb1: Sharp, intense peak. Creb3: Strong signal. Creb3L1: Less significant",
"Creb3L1: Strongest signal"),
Implications = c("Creb1 may play a dominant role in gene regulation. Creb3 may cooperate or compete. Creb3L1 has a lesser role.",
"Creb3L1 likely plays a key role in these genes and processes. Glycosaminoglycan metabolism is highly enriched.")
)
# Use kable to create a nicely formatted table
kable(cluster_summary, caption = "Summary of Key Points for Each Cluster")
| Cluster | Key_Biological_Processes | Region_Hits | Transcription_Factor_Peaks | Implications |
|---|---|---|---|---|
| Cluster 1 | Developmental processes, especially neuronal development | Significant association | Creb1: Sharp, intense peak. Creb3: Strong signal. Creb3L1: Less significant | Creb1 may play a dominant role in gene regulation. Creb3 may cooperate or compete. Creb3L1 has a lesser role. |
| Cluster 4 | Glycosaminoglycan metabolic processes, amide metabolic processes | Amide metabolic processes have most hits | Creb3L1: Strongest signal | Creb3L1 likely plays a key role in these genes and processes. Glycosaminoglycan metabolism is highly enriched. |